############################################
# Prepare data for model fitting
# Author: Haohan Chen
############################################

# Dependencies
import os
import sys

import numpy as np
import pandas as pd

def _attach_labels(frame, label_select):
    """Return a new frame with 'tweet_id_str', 'text' and a per-row 'labels' column."""
    # Copy first so that adding the column never writes into a slice of the
    # caller's frame (this is what triggers pandas' SettingWithCopyWarning).
    out = frame[['tweet_id_str', 'text']].copy()
    # One array per row, holding the values of the selected label columns.
    out['labels'] = list(frame[label_select].values)
    return out


def prepare_data(label_select, subset, random_seed=1, *,
                 path_codebook=None, path_data=None):
    """Build the bootstrapped training set and the validation set.

    Tweets are split 80/20 by unique ``tweet_id_str`` so that all rows of a
    tweet land on the same side of the split.

    Parameters
    ----------
    label_select : list
        Identifiers of the labels to keep. Each one is used both as a
        ``codebook_id`` value in the codebook and as a column name in the
        coded-tweets file.
    subset : str
        ``'txt'`` trains on rows with ``treatment == 0``; any other value
        trains on rows with ``treatment == 1``.
    random_seed : int
        Seed for the ID shuffle and for the bootstrap resampling.
    path_codebook : str, optional
        Path of the codebook CSV. Defaults to the module-level
        ``PATH_CODEBOOK``.
    path_data : str, optional
        Directory containing ``coded_label_wide_experiment.csv``. Defaults to
        the module-level ``PATH_DATA``.

    Returns
    -------
    (d_tr, d_va, codebook)
        ``d_tr`` and ``d_va`` are DataFrames with the columns
        ``tweet_id_str``, ``text`` and ``labels``; ``codebook`` is a list of
        ``"<category> - <label>"`` strings, one per entry of ``label_select``.

    Raises
    ------
    IndexError
        If an entry of ``label_select`` has no matching ``codebook_id``.

    Notes
    -----
    Calls ``np.random.seed(random_seed)``, i.e. it reseeds NumPy's global
    random state as a side effect.
    """
    # Fall back to the module-level locations when no override is given.
    if path_codebook is None:
        path_codebook = PATH_CODEBOOK
    if path_data is None:
        path_data = PATH_DATA

    # Load codebook
    d_codebook = pd.read_csv(path_codebook)
    d_codebook['cat_label'] = d_codebook['category'] + ' - ' + d_codebook['label']
    codebook = []
    for lab in label_select:
        matches = d_codebook.query(f"codebook_id == {lab}")['cat_label'].values
        if len(matches) == 0:
            raise IndexError(f"codebook_id {lab!r} not found in {path_codebook}")
        codebook.append(matches[0])

    # Load coded tweets
    d = pd.read_csv(f'{path_data}/coded_label_wide_experiment.csv')

    # Shuffle the unique tweet IDs (to avoid the original data order of the
    # IDs affecting the random split), then cut 80% / 20%.
    tweet_id_str = d.tweet_id_str.unique()
    np.random.seed(random_seed)
    np.random.shuffle(tweet_id_str)
    n_train = int(len(tweet_id_str) * 0.8)
    train_id = tweet_id_str[:n_train]
    val_id = tweet_id_str[n_train:]

    # Training set: the requested treatment arm only, then bootstrap
    # (resample with replacement to the same size).
    train_treatment = 0 if subset == 'txt' else 1
    d_tr = d[d.tweet_id_str.isin(train_id) & (d['treatment'] == train_treatment)]
    d_tr = d_tr.sample(frac=1, replace=True, random_state=random_seed)

    # Validation set: always the treated group.
    d_va = d[d.tweet_id_str.isin(val_id) & (d['treatment'] == 1)]

    return _attach_labels(d_tr, label_select), _attach_labels(d_va, label_select), codebook
